home *** CD-ROM | disk | FTP | other *** search
- /* WIDE AREA INFORMATION SERVER SOFTWARE:
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
-
- */
-
-
- /* retrieval part of the serial ir engine. if you are using a different
- storage system for the documents, replace this file. */
-
- #include "irretrvl.h"
- #include <string.h>
- #include "futil.h"
- #include <ctype.h> /* for isspace */
-
- /*----------------------------------------------------------------------*/
-
-
- static boolean parseDocID
- _AP((DocObj* doc,char* filename,long* start_character,
- long* end_character,long* errorCode));
-
- static boolean
- parseDocID(doc,filename,start_character,end_character,errorCode)
- DocObj* doc;
- char* filename;
- long* start_character;
- long* end_character;
- long* errorCode;
- {
- DocID* theDocID = NULL;
- char* local_id = NULL;
- char* token = NULL;
- long i;
-
- theDocID = docIDFromAny(doc->DocumentID);
-
- local_id = anyToString(GetLocalID(theDocID));
-
- freeDocID(theDocID);
-
- /* parse the doc id into start pos, end pos, and filename */
- /* first the start char */
- token = local_id;
- for (i = 0; local_id[i] != '\0' && isspace(local_id[i]) == false; i++)
- ;
- if (local_id[i] == '\0')
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Attempt to retrieve data for bad doc-id: '%s'",local_id);
- *errorCode = GDT_BadDocID;
- s_free(local_id);
- return(false);
- }
- local_id[i] = '\0';
- sscanf(token,"%ld",start_character);
- /* now the second char */
- token = local_id + i + 1;
- for (++i; local_id[i] != '\0' && isspace(local_id[i]) == false; i++)
- ;
- if (local_id[i] == '\0')
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "attempt to retrieve data for bad doc-id: '%s'",
- local_id);
- *errorCode = GDT_BadDocID;
- s_free(local_id);
- return(false);
- }
- local_id[i] = '\0';
- sscanf(token,"%ld",end_character);
- /* and finally the file name */
- strncpy(filename,local_id + i + 1,MAX_FILENAME_LEN);
- s_free(local_id);
- return(true);
- }
-
-
- /*----------------------------------------------------------------------*/
-
- WAISDocumentText* getData(doc, databaseName, errorCode)
- DocObj* doc;
- char *databaseName;
- long* errorCode;
- /* it isn't text, so we can just grab data */
- {
- FILE* file = NULL;
- char fileName[MAX_FILENAME_LEN + 1];
- WAISDocumentText* data = NULL;
- long start,end; /* position of the document in the file */
- long startByte,endByte,bytes,bytesRead; /* part of the doc that we want */
- char* buffer = NULL;
- any* bufAny = NULL;
-
- /* we can only handle byte chunks here */
- if ((doc->ChunkCode == CT_byte) ||
- (doc->ChunkCode == CT_document)) {
- if (parseDocID(doc,fileName,&start,&end,errorCode) == false)
- return(NULL);
-
- file = s_fopen(fileName,"rb");
- if (file == NULL)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "attempt to retrieve data for missing doc-id: '%s'",
- fileName);
- *errorCode = GDT_MissingDocID;
- return(NULL);
- }
-
- if (doc->ChunkCode == CT_byte) {
- startByte = doc->ChunkStart.Pos + start;
- endByte = doc->ChunkEnd.Pos + start;
- }
- else {
- startByte = start;
- endByte = end;
- }
-
- waislog(WLOG_LOW, WLOG_RETRIEVE,
- "Retrieving DocID: %d %d %s, byte: %d %d, from database %s",
- start, end, fileName, startByte, endByte, databaseName);
-
- if (endByte > end && end != 0)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "retrieval beyond bounds of document %ld in file <%s>",
- endByte,fileName);
- *errorCode = GDT_BadRange;
- return(NULL);
- }
-
- /* get the bytes */
- if (fseek(file,startByte,SEEK_SET) != 0)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "retrieval can't seek to %ld in file <%s>",startByte,
- fileName);
- *errorCode = GDT_BadRange;
- return(NULL);
- }
-
- bytes = endByte - startByte;
- buffer = (char*)s_malloc(bytes);
-
- bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
-
- if (bytesRead != bytes)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "retrieval error in file <%s>",fileName);
- *errorCode = GDT_BadRange;
- return(NULL);
- }
-
- bufAny = makeAny(bytesRead,buffer);
-
- data = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
-
- /* the any and the buffer are freed by freeWAISSearchResponse() */
- s_fclose(file);
-
- *errorCode = GDT_NoError;
-
- return(data);
- }
- else
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "search engine can only use whole documents or byte offsets for data lookup");
- *errorCode = GDT_UnsupportedChunkType;
- return(NULL);
- }
-
- }
-
- /*----------------------------------------------------------------------*/
-
- #define BUFSZ (size_t)5000
-
- WAISDocumentText* getDocumentText(doc, databaseName, errorCode)
- DocObj* doc;
- char *databaseName;
- long* errorCode;
- /* find the text for doc, get the sub part if any, finally construct and
- return a WAISDocumentText. If it can not find the document
- (or some other error) it returns NULL and sets errorCode.
- */
- {
- WAISDocumentText* text = NULL;
- FILE* file = NULL;
- char* buffer = NULL;
- any* bufAny = NULL;
- char filename[MAX_FILENAME_LEN + 1];
- long start_character;
- long end_character;
- register long i;
- long bytes,bytesRead;
- long startByte,endByte,byte,lines;
-
- /* we can only handle line chunks for now */
- if (doc->ChunkCode != CT_line)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "search engine can only use line offsets for now.");
-
- *errorCode = GDT_UnsupportedChunkType;
- return(NULL);
- }
-
- if (parseDocID(doc,filename,&start_character,&end_character,errorCode) ==
- false)
- return(NULL);
-
- waislog(WLOG_LOW, WLOG_RETRIEVE,
- "Retrieving DocID: %d %d %s, line range: %d %d, from database %s",
- start_character, end_character, filename,
- doc->ChunkStart.Pos, doc->ChunkEnd.Pos,
- databaseName);
- /* check the database */
- if(NULL == databaseName){
- *errorCode = GDT_MissingDatabase;
- return(NULL);
- }
-
-
- file = s_fopen(filename,"r");
- if (file == NULL)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "attempt to retrieve text for bad doc-id: '%s'",
- doc->DocumentID);
-
- *errorCode = GDT_MissingDocID;
- return(NULL);
- }
-
- if(0 != fseek(file, start_character, SEEK_SET))
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- " error on attempt to seek into file");
-
- *errorCode = GDT_MissingDocID;
- return(NULL);
- }
- /* find the start byte */
- buffer = (char*)s_malloc(BUFSZ);
- lines = byte = 0;
- while (lines < doc->ChunkStart.Pos)
- { /* search a buffer full */
- bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file);
- for (i = 0; i < bytesRead && lines < doc->ChunkStart.Pos; i++, byte++)
- { if (buffer[i] == '\n' || buffer[i] == '\r')
- /* \r should not happen because we are reading the file in text
- mode */
- lines++;
- }
- if (bytesRead == 0) /* cheasy handling files that don't end with nl */
- lines++;
- }
- startByte = byte;
-
- beFriendly();
-
- /* find the end byte */ /* this could be done while getting the bytes XXX */
- /* search starting form the start pos */
- if (fseek(file,startByte + start_character,SEEK_SET) != 0)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "retrieval can't seek to %ld in file <%s>",
- startByte,filename);
-
- *errorCode = GDT_BadRange;
- return(NULL);
- }
-
- beFriendly();
-
- while (lines < doc->ChunkEnd.Pos)
- { /* search a buffer full */
- bytesRead = fread(buffer,(size_t)sizeof(char),BUFSZ,file);
- for (i = 0; i < bytesRead && lines < doc->ChunkEnd.Pos; i++, byte++)
- { if (buffer[i] == '\n' || buffer[i] == '\r')
- /* \r should not happen, we are reading the file in text mode */
- lines++;
- }
- if (bytesRead == 0) /* cheasy handling of files that don't end with nl */
- lines++;
- }
- endByte = byte;
-
- beFriendly();
-
- s_free(buffer);
-
- /* get the bytes */
- if (fseek(file,startByte + start_character,SEEK_SET) != 0)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "retrieval can't seek to %ld in file <%s>",startByte,
- filename);
-
- *errorCode = GDT_BadRange;
- return(NULL);
- }
-
- bytes = endByte - startByte;
- buffer = (char*)s_malloc(bytes);
-
- bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
-
- if (bytesRead != bytes)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "retrieval error in file <%s>",filename);
-
- *errorCode = GDT_BadRange;
- return(NULL);
- }
-
- bufAny = makeAny(bytesRead,buffer);
-
- text = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
-
- /* the any and the buffer are freed by freeWAISSearchResponse() */
- s_fclose(file);
-
- *errorCode = GDT_NoError;
-
- return(text);
- }
-